#Load packages
library(data.table)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.8 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:data.table’:
between, first, last
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(gridExtra)
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
library(lmtest)
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
library(maps)
library(mapdata)
#Add data GPS
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")
head(country_data)
Number of courtries
# Determine the number of different countries
number_of_countries <- length(unique(gps_data$country))
# Display the number of different countries
number_of_countries
[1] 76
Clean the data from missing values
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
drop_na(country, isocode, risktaking, gender, age)
# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))
# Display the cleaned data
cleaned_data
# Display the number of records removed per variable
records_removed_per_variable
country isocode ison region language date
0 0 0 0 0 0
id_gallup wgt patience risktaking posrecip negrecip
0 0 190 634 32 247
altruism trust subj_math_skills gender age
74 163 132 0 276
List of all variable
# List all variables
variable_list <- names(cleaned_data)
# Display the list of variables
print(variable_list)
[1] "country" "isocode" "ison" "region" "language"
[6] "date" "id_gallup" "wgt" "patience" "risktaking"
[11] "posrecip" "negrecip" "altruism" "trust" "subj_math_skills"
[16] "gender" "age"
#select only the variables of interest
cleaned_data <- cleaned_data %>%
select(country, isocode, ison, risktaking, gender, age)
cleaned_data
Calculate Z-Score for Age per Country
Create a new column “age_group” with the age categories
cleaned_data$agecat <- cut(
cleaned_data$age,
breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
right = FALSE # Left end (inclusive), right end (exclusive)
)
# Display the new column
head(cleaned_data)
########## ########## ########## Add
Hardship-list
########## ########## ########## ########## Get the
hardship facts together!!!!
# Lade GDP-Daten aus der CSV-Datei
gdp_data <- read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/GDP_per_capita.csv")
# Merge der Daten basierend auf der gemeinsamen Variable "country" und "name"
cleaned_data <- merge(cleaned_data, gdp_data, by.x = "country", by.y = "name", all.x = TRUE)
# Anzeige der gemergten Daten
head(cleaned_data)
#select only the variables of interest
cleaned_data <- cleaned_data %>%
select(country, isocode, ison, risktaking, gender, age, z_score_age, agecat, value, date_of_information, ranking, region)
cleaned_data
Descriptive
Number of courtries after data cleaning
# Determine the number of different countries
number_of_countries <- length(unique(cleaned_data$country))
# Display the number of different countries
number_of_countries
[1] 76
#CODE FOR THE PRESENTATION ########### ########## ##########
Worldmap of the recorded countries
world_map <- map_data("world") # Create a world map with country borders
recorded_countries <- unique(cleaned_data$country) # Get the list of recorded countries from your cleaned_data
world_map$recorded <- ifelse(world_map$region %in% recorded_countries, "Recorded", "Not Recorded") # Create a new variable indicating whether a country has been recorded or not
# Plot the world map with recorded countries highlighted
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
geom_polygon(color = "white") +
scale_fill_manual(values = c("Recorded" = "darkblue", "Not Recorded" = "lightgrey"), guide = "none") + # Set guide to "none" to remove the legend
theme_void() +
labs(title = "GPS", fill = "Status") +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) # Remove legend and center the title

–> Add mean gender
–> add new columns to the datasetv –> put it to the data
preparation
Summarize data across all regions
summary_by_region <- cleaned_data %>%
group_by(region) %>%
summarize(
mean_age = mean(age, na.rm = TRUE),
mean_risk = mean(risktaking, na.rm = TRUE),
total_participants = n()
)
# Display the summarized data
print(summary_by_region)
All about the age
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)
# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)
# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)
# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
Age Range: 15 to 99
cat("Average Age: ", average_age, "\n")
Average Age: 41.73605
cat("Median Age: ", median_age, "\n")
Median Age: 40
Number of participants in each age category
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)
# Display the number of participants in the age categories
print(agecat_counts)
15-19 20-29 30-39 40-49 50-59 60-69 70-79 80+
6888 16872 15905 13583 11374 8570 4688 1559
Boxplot of Age Distribution per Country
ggplot(cleaned_data, aes(x = age)) +
geom_histogram(binwidth = 0.5, fill = "lightblue", color = "blue") +
labs(x = "Age", y = "Frequency", title = "Histogram of Age Distributionn") +
theme_minimal()

ggplot(cleaned_data, aes(x = country, y = age)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Distribution of Age per Country", x = "Country", y = "Age") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot(cleaned_data, aes(x = country, y = risktaking)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Distribution of Age per Country", x = "Country", y = "Risktaking") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

All about the gender
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)
# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
Number of Females: 36024
cat("Number of Males: ", gender_counts[2], "\n")
Number of Males: 43415
Table with country and gender
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)
# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)
# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)
# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
country = rownames(gender_by_country),
total_participants = total_participants,
female = gender_by_country[, "Female"],
male = gender_by_country[, "Male"]
)
# Display the result table
result_table
Analysis of item risktaking
# Summary statistics for risktaking
risk_summary <- summary(cleaned_data$risktaking)
# Histogram of risktaking
ggplot(cleaned_data, aes(x = risktaking)) +
geom_histogram(binwidth = 1, fill = "lightblue", color = "blue", alpha = 0.7) +
labs(title = "Distribution of Risk-Taking", x = "Risk-Taking Score", y = "Frequency") +
theme_minimal()

# Boxplot of risktaking by gender
ggplot(cleaned_data, aes(x = as.factor(gender), y = risktaking, fill = as.factor(gender))) +
geom_boxplot() +
labs(title = "Risk-Taking by Gender", x = "Gender", y = "Risk-Taking Score") +
scale_x_discrete(labels = c("0" = "male", "1" = "female")) +
theme_minimal() +
guides(fill = FALSE)
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.

# Risk vs age
ggplot(cleaned_data, aes(risktaking, age)) +
geom_point(size = 0.2) +
geom_smooth(method = "lm")

# Boxplot of risktaking by age group
cleaned_data$age_group <- cut(cleaned_data$age, breaks = c(0, 25, 35, 45, 55, Inf),
labels = c("18-25", "26-35", "36-45", "46-55", "56+"))
ggplot(cleaned_data, aes(x = age_group, y = risktaking, fill = age_group)) +
geom_boxplot() +
labs(title = "Risk-Taking by Age Group", x = "Age Group", y = "Risk-Taking Score") +
theme_minimal() +
guides(fill = FALSE)

# Boxplot of risktaking by country
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
geom_boxplot(fill = "lightblue", color = "blue") +
labs(title = "Risk-Taking by Country", x = "Country", y = "Risk-Taking Score") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Change backround change x-Achse acording to the info from the
presentation
Risk vs age with color-coded gender per Country
ggplot(cleaned_data, aes(z_score_age, risktaking, color = factor(gender))) +
geom_point(size = 0.1) +
geom_smooth(method = "lm") +
scale_color_manual(values = c("blue", "red"), labels = c("Male", "Female")) +
labs(color = "Gender")

Laura has to rewrite the code –> Table already included with the
lm
# Fit das Modell mit "age"
model_gender <- lm(risktaking ~ age + gender, data = cleaned_data)
# Intercept and Slope for age
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]
summary(model)
Call:
lm(formula = risktaking ~ age * gender, data = cleaned_data)
Residuals:
Min 1Q Median 3Q Max
-2.3288 -0.6770 -0.0426 0.6446 3.0662
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.6413392 0.0131258 48.861 < 2e-16 ***
age -0.0124875 0.0002925 -42.692 < 2e-16 ***
gender -0.1285097 0.0178148 -7.214 5.50e-13 ***
age:gender -0.0021551 0.0003944 -5.464 4.66e-08 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9644 on 79435 degrees of freedom
Multiple R-squared: 0.07001, Adjusted R-squared: 0.06998
F-statistic: 1993 on 3 and 79435 DF, p-value: < 2.2e-16
Laura has to rewrite the code –> Table already included with the
lm
analyze the results by using “Balkendiagramm”
what does 0 mean? // centered age = mean age of the sample
ggplot(data = regression_results, aes(x = country, y = intercept)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
View the countries with intercept values over 0.75
high_intercept_countries <- subset(regression_results, intercept > 0.75)
# View the countries with intercept values over 0.75
print(high_intercept_countries)
Scatterplots for ‘high_intercept_countries’ contains the top 17
countries
# Create scatterplots with regression lines for countries with intercept > 0.75
plots <- lapply(high_intercept_countries$country, function(country) {
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking, color = factor(gender))) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
color = "Gender") +
scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
print(p)
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")
Scatterplots for ‘high_intercept_countries’ contains the top 17
countries
plots <- lapply(high_intercept_countries$country, function(country) {
p <- ggplot(subset(cleaned_data, country == country), aes(x = ranking.x, y = risktaking, color = factor(gender))) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
color = "Gender") +
scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
print(p)
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")
Create scatterplots with regression lines for individual
countries
# Examples for countries
regression_results <- data.frame(
country = c("Algeria", "Argentina", "Austria"),
intercept = c(0.92053422, 0.51698822, 0.42606684),
slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02 )
)
# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
country <- regression_results$country[i]
intercept <- regression_results$intercept[i]
slope <- regression_results$slope[i]
r_squared <- regression_results$r_squared[i]
p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
labs(title = paste("Linear Regression for", country),
subtitle = paste("Intercept:", round(intercept, 2),
"Slope:", round(slope, 2),
"R-squared:", round(r_squared, 2)))
print(p)
})
# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
for (i in seq_along(plots)) {
filename <- paste0("plot_", i, ".png")
ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}
# Switch back to the original working directory
setwd("..")
print("Individual plots are saved in the 'individual_country_plots' directory.")
Scatterplot with include both individual country regression lines
and an overall regression line
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)
# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
geom_point(size = 0.5) + # Adjust point size
geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2],
color = "black", size = 1) + # Add the overall regression line in red
labs(title = "Scatterplot with Regression Lines for risktaking and age by Country",
x = "Age", y = "Risktaking") +
theme(legend.position = "bottom", # Move the legend below the graph
legend.key.size = unit(0.1, "in")) # Adjust the size of the legend key
Combined List of all 3 files
Prepare Wave 5
# Load the data
WV5_data <- readRDS("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/F00007944-WV5_Data_R_v20180912.rds") # Data of Wave 5
#rename the variables
WV5_data <- WV5_data_df %>%
rename(sex = V235, age = V237, country = V2, wave = V1, risktaking = V86)
WV5_data
#select only the variables of interest
WV5_data <- WV5_data %>%
select(sex, age, country, wave, risktaking)
WV5_data
#decode the country names
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV5_data$country_lab = countrynames$name [match(WV5_data$country, countrynames$code)]
table(WV5_data$country_lab)
WV5_data
Prepare Wave 6
#Read Dataset (Wave 6)
WV6_data <- load("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/WV6_Data_R_v20201117.rdata")
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)
#rename variables in Wave 6
WV6_data <- WV6_data %>%
rename(wave = V1, sex = V240, age = V242,country = V2, risktaking = V76)
#select only the variables of interest
WV6_data <- WV6_data %>%
select(wave, sex, age, country, sex,risktaking)
WV6_data
#decode daraset (Wave 6)
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV6_data$country_lab = countrynames$name [match(WV6_data$country, countrynames$code)]
table(WV6_data$country_lab)
WV6_data
Prepare GPS dataset (cleaned_data)
# Rename "Country" in "country_lab"
print(colnames(cleaned_data))
#combine the 2 dataset (Wave 6 + Wave 5)
WV5_data
WV6_data
data = rbind(WV5_data, WV6_data)
data
#number of countries
length(unique(data$country_lab))
Merge the datasets based on common variables
# Find common countries between the two datasets
common_countries <- intersect(data$country_lab, cleaned_data$country)
# Display the common countries
print(common_countries)
Find countries in “data” but not in “cleaned_data”
countries_only_in_data <- setdiff(data$country_lab, cleaned_data$country)
# Find countries in "cleaned_data" but not in "data"
countries_only_in_cleaned_data <- setdiff(cleaned_data$country, data$country_lab)
# Display the results
cat("Countries only in 'data':", countries_only_in_data, "\n")
cat("Countries only in 'cleaned_data':", countries_only_in_cleaned_data, "\n")
---
title: "R Notebook"
output: html_notebook
---

#Load packages
```{r}
library(data.table)
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(lmtest)
library(maps)
library(mapdata)
```

#Add data GPS
```{r}
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")

head(country_data)
```
# Number of courtries 
```{r}
# Determine the number of different countries
number_of_countries <- length(unique(gps_data$country))

# Display the number of different countries
number_of_countries
```

# Clean the data from missing values
```{r}
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
  drop_na(country, isocode, risktaking, gender, age)

# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))

# Display the cleaned data
cleaned_data

# Display the number of records removed per variable
records_removed_per_variable
```

# List of all variable
```{r}
# List all variables
variable_list <- names(cleaned_data) 

# Display the list of variables
print(variable_list)
```
#select only the variables of interest
```{r}
cleaned_data <- cleaned_data %>%
  select(country, isocode, ison, risktaking, gender, age)
cleaned_data
```

############################# Calculate Z-Score for Age per Country
############## Median = 40 // SD = 15 
```{r}
cleaned_data <- cleaned_data %>%
  group_by(country) %>%
  mutate(z_score_age = scale(age))

# Display the new column with Z-Scores per Country
head(cleaned_data)
```

# Create a new column "age_group" with the age categories
```{r}
cleaned_data$agecat <- cut(
  cleaned_data$age,
  breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
  labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
  right = FALSE # Left end (inclusive), right end (exclusive)
)

# Display the new column
head(cleaned_data)
```

########## ########## ########## ########## Add Hardship-list
########## ########## ########## ########## ########## Get the hardship facts together!!!!
```{r}
# Lade GDP-Daten aus der CSV-Datei
gdp_data <- read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/GDP_per_capita.csv")

# Merge der Daten basierend auf der gemeinsamen Variable "country" und "name"
cleaned_data <- merge(cleaned_data, gdp_data, by.x = "country", by.y = "name", all.x = TRUE)

# Anzeige der gemergten Daten
head(cleaned_data)
```
#select only the variables of interest
```{r}
cleaned_data <- cleaned_data %>%
  select(country, isocode, ison, risktaking, gender, age, z_score_age, agecat, value, date_of_information, ranking, region)
cleaned_data
```

# Extrahiere die Spalte "ranking" aus dem gemergten Datenrahmen
```{r}
# Anzahl der Teilnehmenden pro Ranking und Land
participants_by_ranking <- cleaned_data %>%
  group_by(ranking, country) %>%
  summarize(participant_count = n())

# Anzeige der Liste
print(participants_by_ranking)

write.xlsx(participants_by_country, "Teilnehmer_nach_Land.xlsx", row.names = FALSE)

# Pfad, wo die Excel-Datei gespeichert werden soll
excel_path <- "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/"

# Excel-Datei erstellen
write.xlsx(participants_by_country, excel_path, row.names = FALSE)
```

#######################################################################################################################
#######################################################################################################################
# Descriptive 

# Number of courtries after data cleaning
```{r}
# Determine the number of different countries
number_of_countries <- length(unique(cleaned_data$country))

# Display the number of different countries
number_of_countries
```

###########
##########
##########
#CODE FOR THE PRESENTATION 
###########
##########
##########

# Worldmap of the recorded countries
```{r}
world_map <- map_data("world") # Create a world map with country borders

recorded_countries <- unique(cleaned_data$country) # Get the list of recorded countries from your cleaned_data

world_map$recorded <- ifelse(world_map$region %in% recorded_countries, "Recorded", "Not Recorded") # Create a new variable indicating whether a country has been recorded or not

# Plot the world map with recorded countries highlighted
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
  geom_polygon(color = "white") +
  scale_fill_manual(values = c("Recorded" = "darkblue", "Not Recorded" = "lightgrey"), guide = "none") +  # Set guide to "none" to remove the legend
  theme_void() +
  labs(title = "GPS", fill = "Status") +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))  # Remove legend and center the title

```
###########
########## --> Add mean gender 
--> add new columns to the datasetv 
--> put it to the data preparation 

# Summarize data across all regions
```{r}
summary_by_region <- cleaned_data %>%
  group_by(region) %>%
  summarize(
    mean_age = mean(age, na.rm = TRUE),
    mean_risk = mean(risktaking, na.rm = TRUE),
    total_participants = n()
  )

# Display the summarized data
print(summary_by_region)
```

# All about the age
```{r}
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)

# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)

# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)

# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
cat("Average Age: ", average_age, "\n")
cat("Median Age: ", median_age, "\n")
```

# Number of participants in each age category
```{r}
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)

# Display the number of participants in the age categories
print(agecat_counts)
```

# Boxplot of Age Distribution per Country 
```{r}
ggplot(cleaned_data, aes(x = age)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "blue") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distributionn") +
  theme_minimal()

ggplot(cleaned_data, aes(x = country, y = age)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Distribution of Age per Country", x = "Country", y = "Age") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot(cleaned_data, aes(x = country, y = risktaking)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Distribution of Age per Country", x = "Country", y = "Risktaking") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
```


# All about the gender
```{r}
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)

# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
cat("Number of Males: ", gender_counts[2], "\n")
```

# Table with country and gender
```{r}
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)

# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)

# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)

# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
  country = rownames(gender_by_country),
  total_participants = total_participants,
  female = gender_by_country[, "Female"],
  male = gender_by_country[, "Male"]
)

# Display the result table
result_table
```
# Analysis of item risktaking 
```{r}
# Summary statistics for risktaking
risk_summary <- summary(cleaned_data$risktaking)

# Histogram of risktaking
ggplot(cleaned_data, aes(x = risktaking)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "blue", alpha = 0.7) +
  labs(title = "Distribution of Risk-Taking", x = "Risk-Taking Score", y = "Frequency") +
  theme_minimal()

# Boxplot of risktaking by gender
ggplot(cleaned_data, aes(x = as.factor(gender), y = risktaking, fill = as.factor(gender))) +
  geom_boxplot() +
  labs(title = "Risk-Taking by Gender", x = "Gender", y = "Risk-Taking Score") +
  scale_x_discrete(labels = c("0" = "male", "1" = "female")) +
  theme_minimal() +
  guides(fill = FALSE)

# Risk vs age
ggplot(cleaned_data, aes(risktaking, age)) +
  geom_point(size = 0.2) +  
  geom_smooth(method = "lm")

# Boxplot of risktaking by age group
cleaned_data$age_group <- cut(cleaned_data$age, breaks = c(0, 25, 35, 45, 55, Inf),
                               labels = c("18-25", "26-35", "36-45", "46-55", "56+"))
ggplot(cleaned_data, aes(x = age_group, y = risktaking, fill = age_group)) +
  geom_boxplot() +
  labs(title = "Risk-Taking by Age Group", x = "Age Group", y = "Risk-Taking Score") +
  theme_minimal() +
  guides(fill = FALSE)

# Boxplot of risktaking by country
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Risk-Taking by Country", x = "Country", y = "Risk-Taking Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
```

###########
###########
###########
Change backround 
change x-Achse acording to the info from the presentation

# Risk vs age with color-coded gender per Country
```{r}
ggplot(cleaned_data, aes(z_score_age, risktaking, color = factor(gender))) +
  geom_point(size = 0.1) +  
  geom_smooth(method = "lm") +
  scale_color_manual(values = c("blue", "red"), labels = c("Male", "Female")) +
  labs(color = "Gender")
```


################################################################################################################################################################################################################################################


###########
###########
###########
Laura has to rewrite the code --> Table already included with the lm

```{r}
# Fit das Modell mit "age"
model_gender <- lm(risktaking ~ age + gender, data = cleaned_data)

# Intercept and Slope for age
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]

summary(model)
```

###########
###########
###########
Laura has to rewrite the code --> Table already included with the lm


# Create a table with the following information: country, isocode, n (count of participants), female percentage (%), mean age, age range, and risktaking

### Four loup 
Group by country
mutate intercept 
run and save intercept model 
```{r}
# Group the data by country
table_data <- gps_data %>%
  group_by(country, isocode) %>%
  summarize(
    n = n(),
    female_percentage = mean(gender == 1) * 100,
    mean_age = mean(age, na.rm = TRUE),
    age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
    mean_risktaking = mean(risktaking, na.rm = TRUE)
  )

# Intercept and slope for age
intercept_age <- 0.5759701
slope_age <- -0.0137902

# Intercept and slope for gender
intercept_gender <- 0.124666 
slope_gender <- -0.227338 

# Add the intercept and slope information to the table
table_data <- table_data %>%
  mutate(
    intercept_age = intercept_age,
    slope_age = slope_age,
    intercept_gender = intercept_gender,
    slope_gender = slope_gender,
  )

# Display the updated table
table_data
```


#############################################
#############################################



#Preparing for linear regression
```{r}
# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)

# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")

```


```{r}
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
  drop_na(country, risktaking, age)

# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
  group_by(country) %>%
  do(model = lm(risktaking ~ age, data = .)) %>%
  summarize(
    country = first(country),
    intercept = summary(model)$coefficients[1],
    slope = summary(model)$coefficients[2],
    r_squared = summary(model)$r.squared
  )

# Display regression results for each country
print(regression_results)
```


# analyze the results by using "Balkendiagramm"
###### what does 0 mean? // centered age = mean age of the sample
```{r}
ggplot(data = regression_results, aes(x = country, y = intercept)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
# View the countries with intercept values over 0.75
```{r}
high_intercept_countries <- subset(regression_results, intercept > 0.75)

# View the countries with intercept values over 0.75
print(high_intercept_countries)
```


# Scatterplots for 'high_intercept_countries' contains the top 17 countries
```{r}
# Create scatterplots with regression lines for countries with intercept > 0.75
plots <- lapply(high_intercept_countries$country, function(country) {
  p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking, color = factor(gender))) +
    geom_point(size = 0.5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
         color = "Gender") +
    scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
  print(p)
})

# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")

for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}

# Switch back to the original working directory
setwd("..")

print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")

```

# Scatterplots for 'high_intercept_countries' contains the top 17 countries
```{r}
plots <- lapply(high_intercept_countries$country, function(country) {
  p <- ggplot(subset(cleaned_data, country == country), aes(x = ranking.x, y = risktaking, color = factor(gender))) +
    geom_point(size = 0.5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
         color = "Gender") +
    scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
  print(p)
})

# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")

for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}

# Switch back to the original working directory
setwd("..")

print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")
```

# Create scatterplots with regression lines for individual countries
```{r}
# Examples for countries
regression_results <- data.frame(
  country = c("Algeria", "Argentina", "Austria"),
  intercept = c(0.92053422, 0.51698822, 0.42606684),
  slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
  r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02	)
)

# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
  country <- regression_results$country[i]
  intercept <- regression_results$intercept[i]
  slope <- regression_results$slope[i]
  r_squared <- regression_results$r_squared[i]
  
  p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(intercept, 2),
                           "Slope:", round(slope, 2),
                           "R-squared:", round(r_squared, 2)))
  print(p)
})

# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")

for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}

# Switch back to the original working directory
setwd("..")

print("Individual plots are saved in the 'individual_country_plots' directory.")
```

# Scatterplot with include both individual country regression lines and an overall regression line
```{r}
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)

# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
  geom_point(size = 0.5) +                      # Adjust point size
  geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
  geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2], 
              color = "black", size = 1) + # Add the overall regression line in red
  labs(title = "Scatterplot with Regression Lines for risktaking and age by Country", 
       x = "Age", y = "Risktaking") +
  theme(legend.position = "bottom",          # Move the legend below the graph
        legend.key.size = unit(0.1, "in"))  # Adjust the size of the legend key
```



#################################################################################################################################################################
# Combined List of all 3 files

# Prepare Wave 5
```{r}
# Load the data
WV5_data <- readRDS("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/F00007944-WV5_Data_R_v20180912.rds") # Data of Wave 5

#rename the variables
WV5_data <- WV5_data_df %>%
  rename(sex = V235, age = V237, country = V2, wave = V1, risktaking = V86)
WV5_data

#select only the variables of interest
WV5_data <- WV5_data %>%
  select(sex, age, country, wave, risktaking)
WV5_data

#decode the country names 
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV5_data$country_lab = countrynames$name [match(WV5_data$country, countrynames$code)]
table(WV5_data$country_lab)
WV5_data
```

# Prepare Wave 6
```{r}
#Read Dataset (Wave 6)
WV6_data <- load("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/WV6_Data_R_v20201117.rdata") 
WV6_data <- WV6_Data_R_v20201117 
print(WV6_data)
```

#rename variables in Wave 6
```{r}
WV6_data <- WV6_data %>%
  rename(wave = V1, sex = V240, age = V242,country = V2, risktaking = V76)

#select only the variables of interest
WV6_data <- WV6_data %>%
  select(wave, sex, age, country, sex,risktaking)
WV6_data
```
#decode daraset (Wave 6)
```{r}
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV6_data$country_lab = countrynames$name [match(WV6_data$country, countrynames$code)]
table(WV6_data$country_lab)
WV6_data
```
# Prepare GPS dataset (cleaned_data)
```{r}
# Rename "Country" in "country_lab"
print(colnames(cleaned_data))
```

#combine the 2 dataset (Wave 6 + Wave 5)
```{r}
WV5_data
WV6_data
data = rbind(WV5_data, WV6_data)
data

#number of countries

length(unique(data$country_lab))
```


# Merge the datasets based on common variables
```{r}
# Find common countries between the two datasets
common_countries <- intersect(data$country_lab, cleaned_data$country)

# Display the common countries
print(common_countries)

```

# Find countries in "data" but not in "cleaned_data"
```{r}
countries_only_in_data <- setdiff(data$country_lab, cleaned_data$country)

# Find countries in "cleaned_data" but not in "data"
countries_only_in_cleaned_data <- setdiff(cleaned_data$country, data$country_lab)

# Display the results
cat("Countries only in 'data':", countries_only_in_data, "\n")
cat("Countries only in 'cleaned_data':", countries_only_in_cleaned_data, "\n")
```



